#Instructions: this code is divided into three parts; 
# The first takes georeferenced data from PRIO at the 0.5' by 0.5' grid cell level and constructs ethno-political group variables
# The second takes the data set by Cederman et al. and aggregates the group-year data by political periods (as defined in the paper)
# The thid part combines the variables created by the first two sections and creates three datasets for analysis. 
# Intermediary files created by the first two parts are available in the dataverse to reduce upfront costs of replication
# Indeed, many of the aggregation functions take a lot of time to run. 
________________________________________________________________________________________________________________________

#Loading the "raw" data
ineq<-read.dta("ineq_data.dta") # data from Cederman et al.'s paper.

download.file("http://www.prio.no/upload/cscw/PRIO-GRID/csv/distance_v1_01.zip","distance_v1_01.zip")
download.file("http://www.prio.no/upload/cscw/PRIO-GRID/csv/priogrid_v1_01.zip","priogrid_v1_01.zip")
download.file("http://www.prio.no/upload/cscw/PRIO-GRID/csv/socioeco_v1_01.zip","socioeco_v1_01.zip")
download.file("http://www.prio.no/upload/cscw/PRIO-GRID/csv/physclimate_v1_01.zip","physclimate_v1_01.zip")
download.file("http://www.prio.no/upload/cscw/PRIO-GRID/csv/nordhaus_v1_01.zip","nordhaus_v1_01.zip")
download.file("http://www.prio.no/upload/cscw/PRIO-GRID/csv/geoepr_v1_01.zip","geoepr_v1_01.zip")

#Reading the datasets and subsetting relevant parts
distance<-read.table(unz("distance_v1_01.zip","distance_v1_01.csv"),sep=";",header=TRUE)
distance<-distance[,c('gid','year','capdist')]
priogrid<-read.table(unz("priogrid_v1_01.zip","priogrid_v1_01.csv"),sep=";",header=TRUE)
prio<-priogrid[,c('gid','year','gwno','cellarea')] # just gid, year, gwno - gid is the cell
socio<-read.table(unz("socioeco_v1_01.zip","socioeco_v1_01.csv"),sep=";",header=TRUE)
socio<-socio[,c('gid','year','pop')] # pop only for 1990 onwards, changing every five years
terrain<-read.table(unz("physclimate_v1_01.zip","physclimate_v1_01.csv"),sep=";",header=TRUE)
terrain<-terrain[terrain$year==2000,c('gid','year','mnt')]
nordhaus<-read.table(unz("nordhaus_v1_01.zip","nordhaus_v1_01.csv"),sep=";",header=TRUE)
nordhaus<-nordhaus[,c('gid','gcppc90','gcppc95','gcppc00','gcppc05')]
colnames(nordhaus)<-c('gid','gcppc.1990','gcppc.1995','gcppc.2000','gcppc.2005')
nordhauslong<-reshape(nordhaus, direction="long",varying=list(2:5),v.names="gcppc", timevar="year",idvar="gid",times=c("1990","1995","2000","2005"))
nordhauslong$year<-as.numeric(nordhauslong$year)
groups<-read.table(unz("geoepr_v1_01.zip","geoepr_v1_01.csv"),sep=";",header=TRUE)
grp<-groups[,c('gid','year','grpid')] # gid, year, grpid


#Make a count of groups present per cell (overlap of politically relevant groups in space)
groupcounts<-aggregate(grp$grpid,by=list(grp$gid,grp$year),length) #this is for cells that are covered by the political data set, change studyunits to groups for a version that covers all priogrid celsl
colnames(groupcounts)<-c("gid","year","count")
grp<-join(grp,groupcounts,by=c("gid","year"))

#trimming grid version of geoepr by subsetting same observations as in the ineq dataset
ineq_obs<-ineq[,c(1,2,3)]
colnames(ineq_obs)<-c("grpid","year","gwno")
studyunits<-join(ineq_obs,grp,by=c("grpid","year"),type="inner")  # This is a dataset to readily have grpid, gwno, year, and gid matched for those units included in the ineq dataset
#there are 5 countries from ineq that get dropped even though ineq is the less includedusive data set, eventually check which ones.
#adding a field for the number of groups in each ctry

#make a count of groups present per ctry
num.grp<-aggregate(grpid~gwno+year,data=studyunits,function(x) length(unique(x)))
colnames(num.grp)<-c("gwno","year","num.grp")

#Making full files to run code
fullset<-join(studyunits,prio,by=c("gid","year","gwno")) # this adds cell area

# Variables to create by aggregation: distribution of groups in space, economic variable(s), distance from capital, mountainous terrain
# Two useful functions for aggregation
# Simple sum over a factor
denom<-function(x,wei.var,double.count='NULL'){ ifelse(double.count=='NULL',sum(x[,wei.var],na.rm=TRUE),sum(x[,wei.var]/x[,double.count],na.rm=TRUE))} #denominator of a weighing function
# CAUTION: ONLY CHANGE DOUBLE COUNT SETTING IF EVALUATING FUNCTION PARTITIONS (GRID OR ctry). THIS IS WHEN MULTIPLE COUNTING IS AN ISSUE. IF EVALUTING FOR GROUPS, LEAVE NULL.
#the double count is for computations done by grid cell instead of by group -> the grid cells will be double counted because different groups are attributed the same value
# Weighted mean # this is to make weighted means of 'arg' using 'wei.var'
wei.mean<-function(x,arg,wei.var,double.count='NULL'){
  x<-x[which(is.na(x[,arg])==FALSE),]
  x<-x[which(is.na(x[,wei.var])==FALSE),]
  if(double.count=='NULL') {x<-x} else {x<-x[which(is.na(x[,double.count])==FALSE),]}
  ifelse(double.count=='NULL',sum(x[,arg]*x[,wei.var]/denom(x,wei.var,double.count),na.rm=TRUE),sum((x[,arg]*x[,wei.var]/x[,double.count])/denom(x,wei.var,double.count)))
} 

# Variable 1: DISTANCE from capital- requires capdist, gid, cell area, year, grpid, gwno, pop (for the population avg version)
#Create needed file
u<-fullset # here put any dataset that subsets the desired analysis (for example, to test out the code, can put a subset of the data)
fulldist<-join(u,distance,by=c("gid","year"))


#weighted by cell area only
#These are thresholds defining zones within a country as a function of the distance relative to maximum distance to capital
zonebreaks<-function(x){
  break1<-max(x$capdist)/3
  break2<-break1*2
  out<-data.frame(cbind(break1=break1,break2=break2))}  
breaks<-ddply(fulldist,.(gwno,year),zonebreaks)
fulldist[,c("break1","break2")]<-join(fulldist,breaks,by=c("gwno","year"))[,c('break1','break2')]
fulldist$zone<-(fulldist$capdist<fulldist$break1)*1+(fulldist$capdist>fulldist$break1&fulldist$capdist<fulldist$break2)*2+(fulldist$capdist>fulldist$break2)*3

grp.avg.zone<-ddply(fulldist,.(grpid,year),wei.mean,arg='zone',wei.var='cellarea')
colnames(grp.avg.zone)<-c('grpid','year','avg.zone')
write.csv(grp.avg.zone,"grp_avg_zone.csv")
grp.avg.dist<-ddply(fulldist,.(grpid,year),wei.mean,arg='capdist',wei.var='cellarea')
colnames(grp.avg.dist)<-c('grpid','year','avg.capdist')
write.csv(grp.avg.dist,"grp_avg_dist.csv")


# Variable 2: HOMOGENEITY: how ethnically homogeneous are homelands of different groups?
#Measure 1: homogeneity. requires num.groups, count (from groupcounts) , grpid, gid, gwno
#Making needed file
u<-fullset # here put any data set you wish to do the analysis for
fullcount<-join(u,num.grp,by=c("gwno","year"))


mixing<-function(x) {
  sum((1-x$count/x$num.grp)*x$cellarea/denom(x,wei.var='cellarea'))/sum((1-1/x$num.grp)*x$cellarea/denom(x,wei.var='cellarea'))} # weighted by area
seg<-ddply(fullcount,.(grpid,year), mixing)
colnames(seg)<-c("grpid","year","seg")
write.csv(seg,"seg.csv")


#measure 2: %space shared with included group. Needs the info on who is included and who is excluded (possibility to change definition of excluded and dominant)
political<-ineq[,c(1,2,3,5,6,7,8,9,10)]  
colnames(political)<-c("grpid","year","gwno",colnames(ineq)[c(5,6,7,8,9,10)]) # making names of identification variables consistent across the two sets of data sets
political$included<-ifelse(political$excluded==1,0,1)
political<-join(u, political[,c('grpid','year','gwno','excluded','included','discrim','powerless')], by=c("grpid","year","gwno"))

count.included<-aggregate(included~gid+year,data=political,function(x) sum(x==1))
colnames(count.included)<-c("gid","year","count.included")
political<-join(political,count.included,by=c("gid","year"))
share.included<-ddply(political,.(grpid,year),function(x) sum(as.numeric(x$count.included>0)*x$cellarea)/denom(x,wei.var='cellarea'))
colnames(share.included)<-c("grpid","year","share.included")
write.csv(share.included,"share_included.csv")

# Variable 3: ECONOMIC - needs pop, gcppc, count (per cell), gid, grpid, gwno, year, included, excluded
u<-fullset  # here put whichever subset you want to analyze
econ<-join(u,nordhauslong,by=c("gid","year"))
econ<-econ[which(econ$year %in% c('1990','1995','2000','2005')),]
socio<-socio[which(socio$year %in% c('1990','1995','2000','2005')),]
pol<-political[which(political$year %in% c('1990','1995','2000','2005')),c('grpid','year','gwno','excluded','included')]
se<-join(econ,socio,by=c("gid","year"))
sepol<-join(se,pol,by=c("grpid","year","gwno"))
sepol$pop<-sepol$pop/sepol$count

#Measure 1: avg wlth per group
avg.wlth<-ddply(se,.(grpid,year),wei.mean,arg='gcppc',wei.var='pop')
colnames(avg.wlth)<-c('grpid','year','group.wlth')

	#ratio of wlth to that of mean ctry wlth

u.single<-unique(u[,c('gwno','grpid','year')])
avg.wlth<-join(avg.wlth,u.single,by=c("grpid","year"),type="inner")  # extracting single measure for each year and group
	# population weights for each group relative to country population
wts.all<-ddply(sepol,.(grpid,year),function(x) sum(x$pop,na.rm=TRUE)/sum(as.numeric(sepol$pop[sepol$gwno==x$gwno[1]&sepol$year==x$year[1]]),na.rm=TRUE))
colnames(wts.all)<-c("grpid","year","wts.all")
avg.wlth<-join(avg.wlth,wts.all,by=c("grpid","year"))


ctry.avg<-ddply(avg.wlth,.(gwno,year),function(x) sum(x$group.wlth*x$wts.all)) # country average is a population weighted mean of the population weighted mean group wealth
colnames(ctry.avg)<-c("gwno","year","ctry.avg")
ratio.to.ctry<-ddply(avg.wlth,.(grpid,year), function(x) x$group.wlth[1]/ctry.avg$ctry.avg[ctry.avg$gwno==x$gwno[1]&ctry.avg$year==x$year[1]])
colnames(ratio.to.ctry)<-c("grpid","year","ratio.to.ctry")
write.csv(ratio.to.ctry,"ratio.to.ctry.csv")

# Measure 2: ratio of wlth to that of included groups - needs file merged with political
wts.incl<-ddply(sepol,.(grpid,year),function(x) ifelse(x$included[1]==1,sum(x$pop,na.rm=TRUE)/sum(as.numeric(sepol$pop[sepol$gwno==x$gwno[1]&sepol$included==1&sepol$year==x$year[1]]),na.rm=TRUE),NA)) # this is to get the ratio of included groups' total population to that of the ctry
colnames(wts.incl)<-c('grpid','year','wts.incl')
avg.wlth<-join(avg.wlth,wts.incl,by=c("grpid","year"))
avg.wlth<-join(avg.wlth,unique(sepol[,c('grpid','year','included','excluded')]),by=c("grpid","year")) # here we add a column saying whether the group is included
	#for each ctry and year, this is the population weighted mean wlth of politically included populations
avg.incl.wlth<-ddply(avg.wlth,.(gwno,year), function(x) sum(x$group.wlth[x$included==1]*x$wts.incl[x$included==1]))
colnames(avg.incl.wlth)<-c("gwno","year","avg.incl.wlth")
ratio.ei<-ddply(avg.wlth,.(grpid,year),function(x) ifelse(x$excluded[1]==1,x$group.wlth[1]/avg.incl.wlth$avg.incl.wlth[avg.incl.wlth$gwno==x$gwno[1]&avg.incl.wlth$year==x$year[1]],NA))
colnames(ratio.ei)<-c("grpid","year","ratio.ei")  # this is the ratio of group wlth to mean wlth of politically included populations, only computed for excluded group
write.csv(ratio.ei,"ratio.ei.csv")


# Variable 4: Difference between within-group and between-group GINIS (Optional as we did not end up making major use of this in our analysis)
#within group GINIS - to get a sense of whether groups have more economic homogeneity than countries or other regional units 
#group ginis
#wts<-dlply(se,.(grpid,year), function(x) x$pop/sum(x$pop,na.rm=TRUE))  # population weights for each cell of a given group area
#wts.ctry<-ddply(se,.(gid,year),function(x) x$pop/sum(se$pop[se$gwno==x$gwno[1]&se$year==x$year[1]],na.rm=TRUE)) # population weights for each cell of a given country
#x$pop/x$count/sum(x$pop/x$count,na.rm=TRUE)
library(reldist)
gini.within<-function(x){
  gini(na.omit(x$gcppc))  #how does it manage to compute since some gcppc values are missing but the weights vector is not adjusted for that? check the results are not completely off.
}
#,wts[[paste(x$grpid,x$year,sep=".")[1]]]
group.ginis<-ddply(econ,.(grpid,year),gini.within)
colnames(group.ginis)<-c("grpid","year","group.ginis")
#Country Within group Gini

se$cell<-rep(1,nrow(se))
se$cell<-se$cell/se$count  # this is to correct for cells being counted twice
within.wts<-function(x){
  (mean(x$gcppc,na.rm=TRUE)*sum(x$cell)^2)/(mean(se$gcppc[se$gwno==x$gwno[1]&se$year==x$year[1]],na.rm=TRUE)*sum(se$cell[se$gwno==x$gwno[1]&se$year==x$year[1]])^2)
  }
within.wts<-ddply(se,.(grpid,year),within.wts)
colnames(within.wts)<-c("grpid","year","within.wts")
temp<-join(within.wts,group.ginis,by=c("grpid","year"))
temp2<-join(temp,u.single[which(u.single$year %in% c('1990','1995','2000','2005')),],by=c("grpid","year"))
within.ginis<-ddply(temp2,.(gwno,year),function(x)  sum(x$within.wts*x$group.ginis))
colnames(within.ginis)<-c("gwno","year","within.ginis")
#Country Ginis
gini.ctry<-function(x){
  gini(na.omit(x$gcppc))
}
#,wts.ctry[[paste(x$gwno,x$year,sep=".")[1]]]
ctry.ginis<-ddply(se,.(gwno,year),gini.ctry)
colnames(ctry.ginis)<-c("gwno","year","ctry.ginis")
#between-group ginis
avg.cell.wlth<-ddply(se,.(gid,grpid,year,gwno),function(x) mean(se$gcppc[se$grpid==x$grpid[1]&se$year==x$year[1]],na.rm=TRUE))
colnames(avg.cell.wlth)<-c("gid","grpid","year","gwno","avg.cell.wlth")
avg.cell.wlth<-join(avg.cell.wlth,se[,c("gid","grpid","year","cell")])

write.csv(avg.cell.wlth,"avg.cell.wlth.csv")
gini.between<-function(x){
  gini(na.omit(x$avg.cell.wlth),x$cell)
}
between.group.ginis<-ddply(avg.cell.wlth,.(gwno,year),gini.between )
colnames(between.group.ginis)<-c("gwno","year","between.group.ginis")
R<-cbind(ctry.ginis,between.group.ginis,within.ginis)
R$diff<-R$between.group.ginis-R$within.ginis
R$between<-ifelse(R$diff>0.01,1,0)
write.csv(R,"R.csv")

# Variable 5 : Mountainous Terrain
#different ways of getting the right data
mnt<-join(u,terrain,by=c("gid"))
grp.avg.mnt<-ddply(mnt,.(grpid,year),wei.mean,arg='mnt',wei.var='cellarea')
colnames(grp.avg.mnt)<-c('grpid','year','avg.mnt')
write.csv(grp.avg.mnt,"grp.avg.mnt.csv")

______________________________________________________________________________________________________________________________________________________________________________________________
# TimeTransformation

#Transforming the initial dataset of Cederman et al. 2011 (ethnic groups and their relationship to the state) to different status quo periods in order to aggregate observation by political periods
# A political period is marked by any change in the political status quo (a change in any group's status or a change in regime type)
library(foreign)
library(plyr)
rawdata<-read.dta("ineq_data.dta")
rawdata<-rawdata[order(rawdata$cowcode,rawdata$cowgroupid,rawdata$year),]
data<-rawdata[,c('cowgroupid','year','cowcode','excluded','powerless','discrim','exclgrps','anocl','democl','onset_ra')] 
attach(data)
groupids<-unique(cowgroupid)
countryids<-unique(cowcode)
years<-unique(year)
# this loop marks a one for each group-year marked by a change in exclusion and in regime
for (j in 1:length(groupids)){
  data$change.in.exclusion[cowgroupid==groupids[j]]<-c(0,abs(diff(excluded[cowgroupid==groupids[j]],1)))
  data$change.in.regime[cowgroupid==groupids[j]]<-c(0, as.numeric(abs(diff(anocl[cowgroupid==groupids[j]],1))+abs(diff(democl[cowgroupid==groupids[j]],1))>0))}
data$change.in.exclusion[is.na(data$change.in.exclusion)==TRUE]<-0 # This step must be replaced by imputation - not done because of the emphasis on combining this data with other datasets in the first cut at this project
data$change.in.regime[is.na(data$change.in.regime)==TRUE]<-0 # same as above

#This loop is to code each group-year as being one in which a political change happened anywhere in the polity (using the variables above)
for (j in 1:length(countryids)){
  for (k in 1:length(years)){  
    data$pol.change[cowcode==countryids[j] & year==years[k]]<-ifelse(('1' %in% data$change.in.regime[cowcode==countryids[j] & year==years[k]])|('1' %in% data$change.in.exclusion[cowcode==countryids[j] & year==years[k]]),1,0)
  }}
# Coding periods, their beginning, end and duration
data$period<-vector(, length=nrow(data))
data$period.start<-vector(, length=nrow(data))
data$period.end<-vector(, length=nrow(data))
for (j in 1:length(groupids)){
  beg<-min(year[cowgroupid==groupids[j]])
  end<-max(year[cowgroupid==groupids[j]])
  for (h in beg:end){
    data$period[cowgroupid==groupids[j]&year==h]<-sum(data$pol.change[cowgroupid==groupids[j]&year<=h])+1
}}

for (j in 1:length(groupids)){
  beg<-min(year[cowgroupid==groupids[j]])
  end<-max(year[cowgroupid==groupids[j]])
  for (h in beg:end){
    data[cowgroupid==groupids[j]&year==h,c('period.start','period.end')]<-range(year[cowgroupid==groupids[j]&data$period==data$period[cowgroupid==groupids[j]&year==h][1]])}}
data$period.length<-data$period.end-data$period.start   
  
detach(data)
attach(data) 
#counting the number of unique units (group-periods)
myfun<-function(x){length(unique(x))}
units<-sum(aggregate(period,by=list(cowgroupid),FUN=myfun)[,2])  #2632 units
data$unitid<-data$cowgroupid+data$period  # Creates a unique id for each group-period observation
  
# aggregating the outcome variable
detach(data)
attach(data) 
summary.rebellion<-aggregate(onset_ra,by=list(unitid),FUN=sum,na.rm=TRUE)
colnames(summary.rebellion)<-c("unitid","rebellions")
data<-merge(data,summary.rebellion,by=c("unitid"))
data$rebel<-as.numeric(data$rebellions>0) #Making it a binary variable


#now remove useless variables
data2<-data[,c('unitid','cowgroupid','year','cowcode','excluded','powerless','discrim','exclgrps','anocl','democl','onset_ra','period','period.start','period.length','rebel')]
# add some variables
data3<-join(data2,rawdata[,c("cowgroupid","year","eeurop","lamerica","ssafrica","asia","lgdpcapl","b")],type="inner")

#change names for compatibility with other data
colnames(data3)<-c('unitid','grpid','year','gwno','excluded','powerless','discrim','exclgrps','anocl','democl','onset_ra','period','period.start','period.length','rebel',"eeurop","lamerica","ssafrica","asia","lgdpcapl","b")

write.csv(data3,"timeaggregated_data.csv")


______________________________________________________________________________________________________________________________________________________________________________________________
# This page of code should be run before the analysis 
# Input files required here are available for convenience or can be created by running the two first parts of this code, entitled
#TimeTransformation and CodePrioGridData respectively. TimeTransformation aggregates group-year observations into group- political period observations from ineq dataset and CodePrioGridData creates group level variables from  Priogrid data


#Trimming and Combining our time and group aggregated datasets (timeaggregated_data.csv from TimeTransformation.R and the eight datafiles of group characteristics generated by CodePrioGridData.R)
trim<-function(df,id){df[which(duplicated(df[,id])==FALSE),] } # this is a function to trim the dataset with aggregated values (keeping group-periods rather than cell-years or group years)
#Trimming the time dataset
timeag<-read.csv("timeaggregated_data.csv")
timeag<-timeag[timeag$excluded==1,] # Attention: this is for the question where we estimate a causal effect for excluded groups: any other subset is possible at this stage to address another question 
timeag<-timeag[-which(is.na(timeag$unitid)==TRUE),]  # there seems to be a bug in the code of timetransformation creating spurious rows of NAs. This is to get rid of them until we find the problem and fix it. Should not affect results
timeag$X<-NULL # clean this column up if it exists in the CSV file 
extimeag<-ddply(timeag,.(unitid),trim,id='unitid')

#Making groupag - we want to join all group aggregated variables constructed from prio data: grp_avg_dist,grp_avg_zone, mnt, ratio.ei, ratio.to.ctry, seg, share_included
grp_avg_dist<-read.csv("grp_avg_dist.csv")
grp_avg_zone<-read.csv("grp_avg_zone.csv") 
grp.avg.mnt<-read.csv("grp.avg.mnt.csv")
ratio.ei<-read.csv("ratio.ei.csv") 
ratio.to.ctry<-read.csv("ratio.to.ctry.csv") 
seg<-read.csv<-read.csv("seg.csv") 
share_included<-read.csv("share_included.csv")

grp_avg_dist$X<-NULL
grp_avg_zone$X<-NULL
grp.avg.mnt$X<-NULL
seg$X<-NULL
share_included$X<-NULL
ratio.ei$X<-NULL
ratio.to.ctry$X<-NULL

temp<-join(grp_avg_dist,grp_avg_zone,by=c("grpid","year"),type="inner")
temp<-join(temp,grp.avg.mnt,by=c("grpid","year"),type="inner")
temp<-join(temp,seg,by=c("grpid","year"),type="inner")
grpag<-join(temp,share_included,by=c("grpid","year"),type="inner")
rm(temp)

# Make the link file: periods
experiods<-extimeag[,c("unitid","grpid","year")]
allperiods<-timeag[,c("unitid","grpid","year","gwno","period")]

# joining extimeag and groupag
grpag2<-join(experiods,grpag, by=c("year","grpid"),type="inner")  #this is joining the file that grpperio IDs, the link to year, period and grpid from the raw data
grpag.per<-ddply(grpag2,.(unitid),trim,id='unitid') #this should give us all the group aggregated variables for each political period. The default here is to take the first value if there is variation within the period. We could do mean instead
grpdataset<-join(extimeag,grpag.per,by=c("unitid","grpid","year"),type="inner") # this should subset grpag only to excluded groups since extimeag only has excluded groups. it should also not change the number of observations.
#To check it went as planned, check that year and period.start are the same.
grpdataset$year<-NULL
grpdata$grpid<-NULL
grpdata$gwno<-NULL
grpdata$excluded<-NULL
grpdata$onset_ra<-NULL
grpdata$period<-NULL

#creating lags - NOTE: too many lags end up being NAs. Next stage is to impute first. Also note that these past period lags should be weighted duration of periods.
grpdataset[,'lag.democ']<-ddply(grpdataset,.(unitid),function(x)  ifelse(x$period==1,NA,ifelse(grpdataset$democl[grpdataset$period==x$period-1&grpdataset$gwno==x$gwno]==1,1,0)))[,2]
grpdataset[,'lag.anoc']<-ddply(grpdataset,.(unitid),function(x)  ifelse(x$period==1,NA,ifelse(grpdataset$anocl[grpdataset$period==x$period-1&grpdataset$gwno==x$gwno]==1,1,0)))[,2]
grpdataset[,'lag.excluded']<-ddply(grpdataset,.(unitid),function(x)  ifelse(x$period==1,NA,ifelse(grpdataset$excluded[grpdataset$period==x$period-1&grpdataset$grpid==x$grpid]=='NA',NA,ifelse(grpdataset$excluded[grpdataset$period==x$period-1&grpdataset$grpid==x$grpid]==1,1,0))))[,2]
grpdataset[,'lag.conflict']<-ddply(grpdataset,.(unitid),function(x)  ifelse(x$period==1,NA,ifelse(sum(grpdataset$rebel[grpdataset$period<x$period&grpdataset$gwno==x$gwno],na.rm=TRUE)>0,1,0)))[,2]
write.csv(grpdataset,"grpdataset.csv")


#Putting together variables for the country-level dataset: first needs averaging of group-level variables
#variables needing averaging (seg, excluded, discriminated/powerless)

grp.area.wts<-ddply(u[,c('grpid','year','gwno','cellarea')],.(grpid,year), function(x) sum(x$cellarea,na.rm=TRUE)/sum(u$cellarea[u$gwno==x$gwno[1] & u$year==x$year[1]],na.rm=TRUE),.progress="text")
colnames(grp.area.wts)<-c("grpid","year","grp.area.wts")
# here enter a dataframe with gwno, grpid, cellarea but also excluded
#If the dataset exgrp.area.wts already exist, better to read it in because the next three lines take over an hour
exgrp.area.wts<-ddply(political[,c('grpid','year','gwno','cellarea','excluded')],.(grpid,year), function(x) ifelse(x$excluded[1]==1,sum(x$cellarea,na.rm=TRUE)/sum(political$cellarea[political$gwno==x$gwno[1] & political$year==x$year[1]&political$excluded==1],na.rm=TRUE),NA),.progress="text")
colnames(exgrp.area.wts)<-c("grpid","year","exgrp.area.wts")
write.csv(exgrp.area.wts,"exgrp.area.wts.csv")

temp<-join(grpag,ratio.ei,type="left")
temp<-join(temp,ratio.to.ctry,type="left")
temp<-join(temp,grp.area.wts,type="inner")
temp<-join(temp,exgrp.area.wts,type="inner")
temp<-join(temp,timeag,type="inner")
#weighted averages for variables defining characteristics of excluded groups
ctry.avg.exgrp.var<-function(x){
x1<-sum(x[x$excluded==1,"avg.capdist"]*x[x$excluded==1,c("exgrp.area.wts")],na.rm=TRUE)
x2<-sum(x[x$excluded==1,"avg.zone"]*x[x$excluded==1,c("exgrp.area.wts")],na.rm=TRUE)
x3<-sum(x[x$excluded==1,"avg.mnt"]*x[x$excluded==1,c("exgrp.area.wts")],na.rm=TRUE)
x4<-sum(x[x$excluded==1,"seg"]*x[x$excluded==1,c("exgrp.area.wts")],na.rm=TRUE)
x5<-sum(x[x$excluded==1,"share.included"]*x[x$excluded==1,c("exgrp.area.wts")],na.rm=TRUE)
x6<-sum(x[x$excluded==1,"b"]*x[x$excluded==1,c("exgrp.area.wts")],na.rm=TRUE)
x7<-sum(x[x$excluded==1,"rebel"],na.rm=TRUE)
out<-data.frame(cbind(exavg.capdist=x1,exavg.zone=x2,exavg.avg.mnt=x3,exavg.seg=x4,exavg.share.included=x5,exavg.b=x6,rebel.excl=x7))
}
ctry.avg.exgrp.var<-ddply(temp,.(gwno,year), ctry.avg.exgrp.var)

#weighted averages for economic  variables defining characteristics of excluded groups - 1990 to 2005
ctry.avg.exgrp.var2<-function(x){
  x1<-sum(x[x$excluded==1,"ratio.ei"]*x[x$excluded==1,c("exgrp.area.wts")],na.rm=TRUE)
  x2<-sum(x[x$excluded==1,"ratio.to.ctry"]*x[x$excluded==1,c("exgrp.area.wts")],na.rm=TRUE)
  out<-data.frame(cbind(exavg.ratio.ei=x1,exavg.to.ctry=x2))
}
ctry.avg.exgrp.var2<-ddply(temp[temp$year %in% c('1990','1995','2000','2005'),],.(gwno,year), ctry.avg.exgrp.var2)

#weighted averages for variables defining characteristics of all groups
ctry.avg.allgrp.var<-function(x){
  x1<-sum(x[,"excluded"]*x[,c("grp.area.wts")],na.rm=TRUE)
  x2<-sum(x[,"powerless"]*x[,c("grp.area.wts")],na.rm=TRUE)
  x3<-sum(x[,"discrim"]*x[,c("grp.area.wts")],na.rm=TRUE)
  x4<-sum(x[,"seg"]*x[,c("grp.area.wts")],na.rm=TRUE)
  x5<-sum(x[,"rebel"],na.rm=TRUE)
  x6<-sum(x[x$excluded==0,"rebel"],na.rm=TRUE)
  out<-data.frame(cbind(share.excl=x1,share.pwless=x2,share.discrim=x3,allavg.seg=x4,rebel.all=x5,rebel.incl=x6))
}
ctry.avg.allgrp.var<-ddply(temp,.(gwno,year), ctry.avg.allgrp.var)
#counting number of power-sharing groups
inclgrps<-ddply(temp,.(gwno,year),function(x) length(unique(x$grpid[x$excluded==0])))
colnames(inclgrps)<-c("gwno","year","inclgrps")
# Making ctryag: all non-econ ctry level variables with gwno, year from 1946-2005 
# First redefining periods to be compatible with country
ctry.obs<-ddply(timeag[,c('gwno','year','period')],.(gwno,year),function(x) ctryperiod<-max(x$period))
colnames(ctry.obs)<-c("gwno","year","ctryperiod")
ctry.obs<-unique(ctry.obs[,c('gwno','year','ctryperiod')])
ctry.obs<-join(ctry.obs,ctry.avg.exgrp.var,by=c("gwno","year"),type="inner")
ctry.obs<-join(ctry.obs,ctry.avg.allgrp.var,by=c("gwno","year"))
ctry.obs<-join(ctry.obs,inclgrps,by=c("gwno","year"),type="left")
ctry.obs<-join(ctry.obs,temp[,c('gwno','year','anocl','democl','period.start','period.length','eeurop','lamerica','ssafrica','asia','lgdpcapl')],type="inner")
#trimming
ctry.obs$ctryunitid<-ctry.obs$gwno+ctry.obs$ctryperiod
ctrydataset<-ddply(ctry.obs,.(ctryunitid),trim,id='ctryunitid')
write.csv(ctrydataset,"ctrydataset.csv")

# Making ctryag with econ variables for years 1990-2005: all ctry level variables with gwno, year. Merge 
ctry.obs<-ddply(timeag[,c('gwno','year','period')],.(gwno,year),function(x) ctryperiod<-max(x$period))
colnames(ctry.obs)<-c("gwno","year","ctryperiod")
ctry.obs<-unique(ctry.obs[ctry.obs$year>=1990,c('gwno','year','ctryperiod')])
ctry.obs<-join(ctry.obs,ctry.avg.exgrp.var,by=c("gwno","year"),type="inner")
ctry.obs<-join(ctry.obs,ctry.avg.exgrp.var2,by=c("gwno","year"))
ctry.obs<-join(ctry.obs,ctry.avg.allgrp.var,by=c("gwno","year"))
ctry.obs<-join(ctry.obs,inclgrps,by=c("gwno","year"),type="left")
Rtemp<-join(R[,c('gwno','year','between')],ctry.obs[,c("year","ctryperiod","gwno")],type="inner")
ctry.obs<-join(ctry.obs,Rtemp[,c('gwno','ctryperiod','between')],by=c("gwno","ctryperiod"),type="left")
ctry.obs<-join(ctry.obs,temp[,c('gwno','year','anocl','democl','period.start','period.length','eeurop','lamerica','ssafrica','asia','lgdpcapl')],type="inner")
#trimming
ctry.obs$ctryunitid<-ctry.obs$gwno+ctry.obs$ctryperiod
ctrydataset2<-ddply(ctry.obs,.(ctryunitid),trim,id='ctryunitid')
ctrydataset2[,'lag.democ']<-ddply(ctrydataset2,.(ctryunitid),function(x)  ifelse(x$ctryperiod==1,NA,ifelse(ctrydataset2$democl[ctrydataset2$ctryperiod==x$ctryperiod-1&ctrydataset2$gwno==x$gwno]==1,1,0)))[,2]
ctrydataset2[,'lag.anoc']<-ddply(ctrydataset2,.(ctryunitid),function(x)  ifelse(x$ctryperiod==1,NA,ifelse(ctrydataset2$anocl[ctrydataset2$ctryperiod==x$ctryperiod-1&ctrydataset2$gwno==x$gwno]==1,1,0)))[,2]
ctrydataset2[,'lag.conflict']<-ddply(ctrydataset2,.(ctryunitid),function(x)  ifelse(x$ctryperiod==1,NA,ifelse(sum(ctrydataset2$rebel.all[ctrydataset2$ctryperiod<x$ctryperiod&ctrydataset2$gwno==x$gwno],na.rm=TRUE)>0,1,0)))[,2]
write.csv(ctrydataset2,"ctrydataset2.csv")